# === Replication data for Bureaucratic Responsiveness in Humanitarian Aid, Appendix A and B === #
## This is the R code used for the data analyses presented in Appendix A and Appendix B ##
## The data here is intended to replicate figures and tables derived from the LDA topic model ##
## These include Table A3, Table A4, and Figures B1-B5 ##

#load environment
library(dplyr)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
library(ggplot2)
library(topicmodels)
library(ldatuning)
library(doParallel)
library(textmineR)
library(wesanderson)
library(tidytext)
library(gridExtra)
library(ape)
library(scales)

#load document term matrix tm_col
load(file = "tm_col_DTM.rds")

#calculate optimum k for topic model using ldatuning
# == NOTE: This will take several days of computing time and requires a large amount of memory == #
# == If you would like to skip this step, you can load the completed results below. == #
# == If you would like to complete this step please remove the hashtags surrounding the code == #

#tunes <- FindTopicsNumber(
#tm_col,
#topics = c(1:10 * 10, 120, 140, 160, 180, 0:5 * 50 + 200),
#metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010"),
#method = "Gibbs",
#control = list(seed = 77),
#mc.cores = 4L,
#verbose = TRUE) 

#load the completed tunes
load("tunes_topicnumbers.rds")

# === Figure B1. A depiction of topic model tuning === #
FindTopicsNumber_plot(tunes)

#Evaluate potential topic models using a 5-fold cross-validation of perplexity. 
# == NOTE: This will take several hours of computing time and requires a large amount of memory == #
# == If you would like to skip this step, you can load the completed results below. == #
# == If you would like to complete this step please remove the hashtags surrounding the code == #

##set possible values for k
#candidate_k <- c(10, 20, 50, 100, 0:5 * 50 + 200)

#cluster <- makeCluster(detectCores(logical = TRUE) - 1) # leave one CPU spare...
#registerDoParallel(cluster)

#clusterEvalQ(cluster, {
#  library(topicmodels)
#})

#burnin = 1000
#iter = 1000
#keep = 50
#n <- nrow(tm_col)
#folds <- 5 
#splitfolds <- sample(1:folds, n, replace = TRUE)
#clusterExport(cluster, c("tm_col", "burnin", "iter", "keep", "splitfolds", "folds", "candidate_k"))

#set.seed(1122)
#system.time({
#  val_results_3 <- foreach(j = 1:length(candidate_k), .combine = rbind) %dopar%{
#    k <- candidate_k[j]
#    results_1k <- matrix(0, nrow = folds, ncol = 2)
#    colnames(results_1k) <- c("k", "perplexity")
#    for(i in 1:folds){
#      train_set <- tm_col[splitfolds != i , ]
#      test_set <- tm_col[splitfolds == i, ]
#      
#      fitted <- LDA(train_set, k = k, method = "Gibbs",
#                    control = list(burnin = burnin, iter = iter, keep = keep) )
#      results_1k[i,] <- c(k, perplexity(fitted, newdata = test_set))
#    }
#    return(results_1k)
#  }
#})
#stopCluster(cluster)

#load the completed cross-validation results
load(file = "perplexity_results.rds")

# === Figure B2. A depiction of perplexity scores for topic model === #
ggplot(perp10_400, aes(x = k, y = perplexity)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(x = "Number of topics", y = "Perplexity")

#create the topic model
# == NOTE: This will take several hours of computing time and requires a large amount of memory == #
# == If you would like to skip this step, you can load the completed results below. == #
# == If you would like to complete this step please remove the hashtags surrounding the code == #

#require(topicmodels)
#K <- 250
#topicModel <- LDA(tm_col, K,
#                  method = "Gibbs",
#                  control=list(seed=1234), 
#                  alpha = 0.1)


#load the completed model
load(file = "LDAtopicmodel_k250.rds")

#cluster topics using hellinger distance
topic_linguistic_dist <- CalcHellingerDist(topicModel@beta)
hclust <- hclust(as.dist(topic_linguistic_dist), "ward.D")

phylo <- as.phylo(hclust)
plot(as.phylo(hclust), type = "unrooted", cex = 0.5,
     lab4ut = "axial",
     no.margin = TRUE)

#identify top terms
top_terms <- terms(topicModel, 5)  

top_terms_df <- data.frame(
  topic = seq_len(ncol(top_terms)),  # Ensures integer topic index (1, 2, 3, ...)
  top_terms = apply(top_terms, 2, function(x) paste(x, collapse = ", ")))


#define clusters using the phylogenetic branches
clusters <- list(
  c1 = as.integer(c("120", "218", "85", "98", "249", "115", "122", "143", "171", "69", "72", "4", "159", "103", "156", "8", "194", "76")),
  c2 = as.integer(c("56", "183", "27", "87", "139", "222", "5", "154", "33", "111", "181", "246", "16", "19", "74", "176", "34", "90")),
  c3 = as.integer(c("93", "211", "42", "133", "28", "31", "41", "96", "92", "12", "182")),
  c4 = as.integer(c("119", "242", "52", "26", "39", "64", "248", "79", "54", "77", "91", "179", "94", "124")),
  c5 = as.integer(c("202", "230", "175", "234", "190", "200", "62")),
  c6 = as.integer(c("86", "22", "58", "100", "2")),
  c7 = as.integer(c("197", "53", "206", "21", "99", "84")),
  c8 = as.integer(c("109", "157", "108", "239", "214", "164", "174", "195", "153")),
  c9 = as.integer(c("144", "102", "148", "65", "204", "59", "25", "228", "61", "49", "29", "127", "245", "128", "198", "106", "15", "235", "141", "35", "113", "110", "36", "24", "201", "238", "227", "117")),
  c10 = as.integer(c("130", "125", "226", "203", "78", "250", "215", "138", "67", "142", "97", "217", "172", "150", "208", "129", "212", "23", "229", "6", "225", "132")),
  c11 = as.integer(c("220", "145", "186", "160", "38", "178", "75", "131", "168", "83", "73", "30")),
  c12 = as.integer(c("196", "213", "68", "223", "184", "17", "210", "14", "192", "7", "244", "82", "240", "193", "152", "71", "9", "167", "219", "101", "80", "233", "224", "18", "123", "45", "20", "237", "216", "50", "13", "207", "89")),
  c13 = as.integer(c("158", "114", "51", "1", "170", "95", "63")),
  c14 = as.integer(c("205", "166", "11", "161", "81", "231", "149")),
  c15 = as.integer(c("10", "66", "57", "70", "116", "236", "37", "60", "40", "55", "32", "169", "126", "180", "47", "185", "191", "189", "107", "140", "121", "187")),
  c16 = as.integer(c("147", "177", "48", "232", "188", "155", "163", "137", "88", "105", "173", "146", "162", "46", "243", "199", "3", "44", "104", "209", "151", "43", "135", "221", "118", "136", "241", "112", "165", "134", "247"))
)

#create a data frame for clusters
clustered_topics <- do.call(rbind, lapply(seq_along(clusters), function(i) {
  data.frame(
    cluster = paste0("Cluster_", i),
    topic = clusters[[i]]
  )
}))

clustered_topics_with_terms <- merge(clustered_topics, top_terms_df, by = "topic", all.x = TRUE)

\clustered_topics_with_terms <- clustered_topics_with_terms[order(clustered_topics_with_terms$cluster, clustered_topics_with_terms$topic), ]

# === Table A3 - A table containing the topics identified === #
print(clustered_topics_with_terms)



#load main data set
data <- read.csv(file = "maindata_fpa_replication.csv")

data <- data %>%
  mutate(case = recode(case,
                       "Afghan" = "Afghanistan",
                       "Gaza" = "Gaza/West Bank"))

df <- data %>%
  group_by(case) %>%
  mutate(avg_words = mean(med_words)) %>%
  ungroup() %>%
  select(case, avg_words) %>%
  pivot_longer(cols = -case, names_to = "variable", values_to = "value") %>%
  unique()

# === Figure B3 - A depiction of the average annual word count per case === #
ggplot(df, aes(fill=case, x=reorder(case, value), y = value)) +
  geom_col() +
  theme(legend.position = "none") +
  theme(axis.text = element_text(angle=90)) + 
  scale_fill_manual(values = 
                      wes_palette(n = 53, "Chevalier1", type = "continuous")) +
  xlab("Country") +
  ylab("Average Annual NYT Word Count")


df <- data %>%
  group_by(case) %>%
  mutate(avg_med_h = mean(med_h)) %>%
  ungroup() %>%
  select(case, avg_med_h) %>%
  gather(key = "variable", value = "value", -case) %>%
  unique()

# === Figure B4 - A depiction of the average annual media diversity per case === #
ggplot(df, aes(fill=case, x=reorder(case, value), y = value)) +
  geom_col() +
  theme(legend.position = "none") +
  theme(axis.text = element_text(angle=90)) + 
  scale_fill_manual(values = 
                      wes_palette(n = 53, "Chevalier1", type = "continuous")) +
  xlab("Country") +
  ylab("Average Annual Media Entropy") + 
  coord_cartesian(ylim = c(0.5, 0.95))


#load original articles to extract document word counts
raw_articles <- read.csv(file = "raw_articles.csv", fileEncoding = "UTF-16LE")

raw_articles <- data.frame(raw_articles, stringsAsFactors = FALSE)

#convert to a quanteda corpus and extract docvars
corp <- corpus(raw_articles, text_field = "Article")

doc_info <- docvars(corp) %>%
  mutate(document = docid(corp))

#clean up the labels in doc_info
doc_info <- doc_info %>%
  mutate(filename = str_remove(basename(Source_File), "\\.docx$")) %>%
  mutate(
    year = str_extract(filename, "\\d{4}(?:_\\d+)?$"),               
    case = str_remove(filename, " \\d{4}(?:_\\d+)?$")                
  )

doc_info <- doc_info %>%
  mutate(case = case_when(
    case == "West Bank" ~ "Gaza",
    TRUE ~ case
  ))

#tidy the topic model
tidy_docs <- tidy(topicModel, matrix = "gamma")

#join to doc_info
tidy_docs <- tidy_docs %>%
  left_join(doc_info %>% select(case, year, document, Length), by = "document")

#use gamma values to calculate topic-focused coverage
tidy_docs <- tidy_docs %>%
  mutate(topic_words = gamma * Length)

topic_word_summary <- tidy_docs %>%
  group_by(case, year, topic) %>%
  summarise(total_topic_words = sum(topic_words, na.rm = TRUE),
            .groups = "drop")

#set the Hellinger distance-derived topic clusters
c1 <- as.integer(c("120", "218", "85", "98", "249", "115", "122", "143", "171", "69", "72", "4", "159", "103", "156", "8", "194", "76"))
c2 <- as.integer(c("56", "183", "27", "87", "139", "222", "5", "154", "33", "111", "181", "246", "16", "19", "74", "176", "34", "90"))
c3 <- as.integer(c("93", "211", "42", "133", "28", "31", "41", "96", "92", "12", "182"))
c4 <- as.integer(c("119", "242", "52", "26", "39", "64", "248", "79", "54", "77", "91", "179", "94", "124"))
c5 <- as.integer(c("202", "230", "175", "234", "190", "200", "62"))
c6 <- as.integer(c("86", "22", "58", "100", "2"))
c7 <- as.integer(c("197", "53", "206", "21", "99", "84"))
c8 <- as.integer(c("109", "157", "108", "239", "214", "164", "174", "195", "153"))
c9 <- as.integer(c("144", "102", "148", "65", "204", "59", "25", "228", "61", "49", "29", "127", "245", "128", "198", "106", "15", "235", "141", "35", "113", "110", "36", "24", "201", "238", "227", "117"))
c10 <- as.integer(c("130", "125", "226", "203", "78", "250", "215", "138", "67", "142", "97", "217", "172", "150", "208", "129", "212", "23", "229", "6", "225", "132"))
c11 <- as.integer(c("220", "145", "186", "160", "38", "178", "75", "131", "168", "83", "73", "30"))
c12 <- as.integer(c("196", "213", "68", "223", "184", "17", "210", "14", "192", "7", "244", "82", "240", "193", "152", "71", "9", "167", "219", "101", "80", "233", "224", "18", "123", "45", "20", "237", "216", "50", "13", "207", "89"))
c13 <- as.integer(c("158", "114", "51", "1", "170", "95", "63"))
c14 <- as.integer(c("205", "166", "11", "161", "81", "231", "149"))
c15 <- as.integer(c("10", "66", "57", "70", "116", "236", "37", "60", "40", "55", "32", "169", "126", "180", "47", "185", "191", "189", "107", "140", "121", "187"))
c16 <- as.integer(c("147", "177", "48", "232", "188", "155", "163", "137", "88", "105", "173", "146", "162", "46", "243", "199", "3", "44", "104", "209", "151", "43", "135", "221", "118", "136", "241", "112", "165", "134", "247"))

#combine into list
topic_clusters <- list(c1=c1, c2=c2, c3=c3, c4=c4, c5=c5, c6=c6, c7=c7, c8=c8,
                       c9=c9, c10=c10, c11=c11, c12=c12, c13=c13, c14=c14, c15=c15, c16=c16)

#convert to data frame
topic_cluster_df <- bind_rows(
  lapply(names(topic_clusters), function(cluster_name) {
    data.frame(topic = topic_clusters[[cluster_name]], cluster = cluster_name)
  })
)

#calculate volume of coverage by cluster
topic_word_summary <- topic_word_summary %>%
  mutate(topic = as.integer(topic)) %>% 
  left_join(topic_cluster_df, by = "topic")

cluster_word_summary <- topic_word_summary %>%
  group_by(case, year, cluster) %>%
  summarise(total_cluster_words = sum(total_topic_words, na.rm = TRUE),
            .groups = "drop")

#read in main data set and join
data <- read.csv(file = "maindata_fpa_replication.csv")

data <- data %>%
  mutate(case = case_when(
    case == "Afghan" ~ "Afghanistan",
    TRUE ~ case
  ))

data <- data %>%
  mutate(year = substr(as.character(year), 1, 4))

joined_data <- cluster_word_summary %>%
  right_join(data, by = c("case", "year"))

#drop cases receiving no media attention whatsoever
joined_data <- joined_data %>%
  filter(med_words != 0) %>%
  filter(med_words != 0.1)

#reorder by word counts and add quartile labels for visualization
joined_data$total_cluster_words[is.na(joined_data$total_cluster_words)] <- 0
joined_data$cluster <- reorder(joined_data$cluster, joined_data$total_cluster_words)

joined_data <- joined_data %>%
  mutate(
    med_words_quartile = case_when(
      med_words >= quantile(med_words, 0.75, na.rm = TRUE) ~ "Top 25%",
      med_words <= quantile(med_words, 0.25, na.rm = TRUE) ~ "Bottom 25%",
      TRUE ~ "Middle 50%"
    ),
    med_h_quartile = case_when(
      med_h >= quantile(med_h, 0.75, na.rm = TRUE) ~ "Top 25%",
      med_h <= quantile(med_h, 0.25, na.rm = TRUE) ~ "Bottom 25%",
      TRUE ~ "Middle 50%"
    )
  )

#create data frames for visualization
#drop junk topics for visualization
bottom_volume <- joined_data %>%
  filter(med_words_quartile == "Bottom 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

top_volume <- joined_data %>%
  filter(med_words_quartile == "Top 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

bottom_diversity <- joined_data %>%
  filter(med_h_quartile == "Bottom 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))

top_diversity <- joined_data %>%
  filter(med_h_quartile == "Top 25%") %>%
  group_by(cluster) %>%
  summarize(c_words = sum(total_cluster_words, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    total_words = sum(c_words),
    percent = c_words / total_words
  ) %>%
  filter(!cluster %in% c("c1", "c13", "c14", "c15", "c16"))


#determine the max percent value across all datasets
max_percent <- max(
  bottom_volume$percent,
  top_volume$percent,
  bottom_diversity$percent,
  top_diversity$percent
)

#create bar graphs
bar1 <- ggplot(bottom_volume, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_text(angle = 90),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(bottom_volume$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar2 <- ggplot(top_volume, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(top_volume$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar3 <- ggplot(bottom_diversity, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_text(angle = 90),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(bottom_diversity$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

bar4 <- ggplot(top_diversity, aes(x = cluster, y = percent, fill = cluster)) +
  geom_col(width = 1, color = "black") +
  theme(
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    axis.text = element_text(size = 15),
    axis.text.y = element_blank(),
    legend.position = "none",
    panel.background = element_rect(fill = "white")
  ) +
  scale_fill_manual(
    values = wes_palette(n = length(unique(top_diversity$cluster)), name = "Chevalier1", type = "continuous")
  ) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    limits = c(0, max_percent)
  )

# === Figure B4 === #
grid.arrange(bar1, bar2, bar3, bar4)